## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: lattice
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## 
## Attaching package: 'seriation'
## The following object is masked from 'package:lattice':
## 
##     panel.lines
## Loading required package: pdc
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Create a vector containing all attribute names
attribute.names = c()
for(pixel.number in 1:9) {
  for(attribute.suffix in c("Red", "Green", "NIR1", "NIR2")) {
    attribute.name = paste("p", pixel.number, ".", attribute.suffix, sep="")
    attribute.names = c(attribute.names, attribute.name)
  }
}
attribute.names = c(attribute.names, "class")

# Create a vector containing all class names
class.names = c("red soil", "cotton crop", "grey soil", "damp grey soil", "soil with vegetation stubble", "very damp grey soil")

# Load the dataset from the csv file and cast the label to factor
dataset = data.frame(read.csv("dataset.csv", col.names = attribute.names, sep=" "))
dataset$class = factor(dataset$class, labels = class.names)

# Determine the input and the output space
dataset.input = dataset[, 1:ncol(dataset)-1]
dataset.output = dataset$class
# Determine some statistical measures on the input attributes
dataset.mean = sapply(dataset.input, mean)
dataset.sd = sapply(dataset.input, sd)
dataset.var = sapply(dataset.input, var)
dataset.min = sapply(dataset.input, min)
dataset.max = sapply(dataset.input, max)
dataset.median = sapply(dataset.input, median)
dataset.range = sapply(dataset.input, range)
dataset.quantile = sapply(dataset.input, quantile)
# Determine class frequencies
class.frequencies = data.frame(table(dataset.output))
colnames(class.frequencies) = c("class", "frequencies")
class.frequencies$percentages = paste(format((class.frequencies$frequencies / sum(class.frequencies$frequencies)) * 100, digits = 2), "%", sep = "")
ggplot(data = class.frequencies, aes(x = class, y = frequencies, fill = class)) + geom_bar(stat = "identity") + geom_text(aes(label = percentages), vjust = 1.5, colour = "white") + theme(axis.text.x=element_blank())

ggplot(data = class.frequencies, aes(x = "", y = frequencies, fill = class)) + geom_bar(stat = "identity", width = 1, color = "white") + coord_polar("y", start = 0)

## Separabilità I valori relativi allo stesso canale tra pixel adiacenti risultano correlati (come evidente dai grafici), questo perche’ il dataset e’ costruito come una sliding window quadrata 3x3 su una singola immagine satellitare. Ci si aspetta quindi una varianza piuttosto contenuta sui singoli valori e la sopra citata correlazione.

TODO: Aggiungere titoli ai featureplot

featurePlot(dataset.input[, c("p5.Red", "p5.Green", "p5.NIR1")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(dataset.input[, c("p5.Red", "p6.Red", "p4.Red")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(dataset.input[, c("p5.Green", "p6.Green", "p4.Green")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(dataset.input[, c("p5.Green", "p2.Green", "p8.Green")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(dataset.input[, c("p5.Red", "p2.Red", "p8.Red")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(dataset.input[, c("p5.NIR1", "p2.NIR1", "p8.NIR1")], dataset.output, plot="pairs", scales=list((relation="free"), y=list(relation="free")), auto.key=list(columns=3))

PCA

Data la correlazione rilevata precedentemente e l’alto numero di attributi in input sembra più che opportuno eseguire una PCA sul dataset al fine di ridurre la dimensione dello spazio di input.

# Execute the PCA. Extract only the first four principal components
pca.results <- PCA(dataset.input, scale.unit = TRUE, ncp = 4, graph = FALSE)

# Get the eigenvalues and plot them. The first four components explain about 92% of variance
pca.results.eig.val <- get_eigenvalue(pca.results)
fviz_eig(pca.results, addlabels = TRUE, ylim = c(0, 50))

# Show the correlation between the variables using the first two principal components
# Many variable are positively correlated
fviz_pca_var(pca.results, col.var = "black")

Analizzando le coordinate per i singoli attributi si riesce a capire i 4 cluster rappresentano i 4 tipi di variabili in gioco (Red, Green NIR1, NIR2)

# Extract the principal components
dataset.input.pca = data.frame(get_pca_ind(pca.results)$coord)
dataset.pca = cbind(class=dataset$class, dataset.input.pca)

# Draw some feature plot to highlight the level of difficulty to recognise the various classes
featurePlot(x=dataset.input.pca, y=dataset.output, plot="density", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))

featurePlot(x=dataset.input.pca, y=dataset.output, plot="pairs", auto.key=list(columns=3))

# Divide the dataset into trainset and testset
indexes = sample(2, nrow(dataset.pca), replace = TRUE, prob = c(0.7, 0.3))
temp.trainset = dataset.pca[indexes == 1,]
testset = dataset.pca[indexes == 2,]
testset.x = testset[, !names(testset) %in% c("class")]
testset.y = testset[, c("class")]

# Divide the trainset to obtain a validationset
indexes = sample(2, nrow(temp.trainset), replace = TRUE, prob = c(0.9, 0.1))
trainset = temp.trainset[indexes == 1,]
validationset = temp.trainset[indexes == 2,]
validationset.x = validationset[, !names(validationset) %in% c("class")]
validationset.y =  validationset[, c("class")]
# Train a SVM model and tune the hyperparameters (cost and gamma)
svm.tuning.control = tune.control(sampling = "fix") # To use validationset instead
svm.tuning = tune.svm(class ~ Dim.1 + Dim.2 + Dim.3 + Dim.4, data = trainset, kernel = "radial", gamma = 10^(-3:1), cost = 10^(-2:1), validation.x = validationset.x, validation.y = validationset.y, tunecontrol = svm.tuning.control, probability = TRUE)
plot(svm.tuning)

# Determine some metrics for each class
svm.tuned = svm.tuning$best.model
svm.tuned.prediction = predict(svm.tuned, testset.x)
svm.tuned.result = confusionMatrix(svm.tuned.prediction, testset.y, mode = "prec_recall")

# TODO: how is calculated? avg, micro-avg, macro-avg?
sprintf("Tuned SVM global accuracy is %2.1f%%", svm.tuned.result$overall["Accuracy"] * 100)
## [1] "Tuned SVM global accuracy is 88.4%"
print(svm.tuned.result$table)
##                               Reference
## Prediction                     red soil cotton crop grey soil damp grey soil
##   red soil                          460           0         7              1
##   cotton crop                         0         190         2              1
##   grey soil                           2           1       369             29
##   damp grey soil                      1           1        13            108
##   soil with vegetation stubble        6           9         0              5
##   very damp grey soil                 0           2         9             38
##                               Reference
## Prediction                     soil with vegetation stubble very damp grey soil
##   red soil                                               11                   0
##   cotton crop                                             6                   0
##   grey soil                                               3                   9
##   damp grey soil                                          1                  39
##   soil with vegetation stubble                          181                  10
##   very damp grey soil                                    18                 406
# Train a NNET model and tune the hyperparameters (size and decay)
nnet.tuning.control = tune.control(sampling = 'fix')
nnet.tuning = tune.nnet(class ~ Dim.1 + Dim.2 + Dim.3 + Dim.4, data = trainset, size = c(2:4), validation.x = validationset.x, validation.y = validationset.y, decay = c(0, 2^-2:1), probability = TRUE)
plot(nnet.tuning)

# Determine some metrics for each class
nnet.tuned = nnet.tuning$best.model
nnet.tuned.prediction = factor(predict(nnet.tuned, testset.x, type = "class"), levels = class.names)
nnet.tuned.result = confusionMatrix(nnet.tuned.prediction, testset.y, mode = "prec_recall")

# TODO: how is calculated? avg, micro-avg, macro-avg?
sprintf("Tuned NNET global accuracy is %2.1f%%", nnet.tuned.result$overall["Accuracy"] * 100)
## [1] "Tuned NNET global accuracy is 84.9%"
print(nnet.tuned.result$table)
##                               Reference
## Prediction                     red soil cotton crop grey soil damp grey soil
##   red soil                          459           2         6              1
##   cotton crop                         0         185         1              0
##   grey soil                           1           0       364             33
##   damp grey soil                      1           0        26             81
##   soil with vegetation stubble        8          16         1              8
##   very damp grey soil                 0           0         2             59
##                               Reference
## Prediction                     soil with vegetation stubble very damp grey soil
##   red soil                                               14                   0
##   cotton crop                                             7                   0
##   grey soil                                               0                   8
##   damp grey soil                                          1                  52
##   soil with vegetation stubble                          168                  15
##   very damp grey soil                                    30                 389

Confronto modelli

Calcolo metriche

In questa sezione vengono calcolate e confrontate metriche di base come sensitivity, specificity e F-scores.

metrics_of_interest = c("Precision", "Specificity", "F1")
svm.macro.fscore = Reduce('+', svm.tuned.result$byClass[,"F1"]) / length(class.names)
svm.micro.fscore = Reduce('+', svm.tuned.result$byClass[,"F1"] * svm.tuned.result$byClass[,"Prevalence"])

sprintf("SVM macro F-Score: %1.3f", svm.macro.fscore)
## [1] "SVM macro F-Score: 0.859"
sprintf("SVM micro F-Score: %1.3f", svm.micro.fscore)
## [1] "SVM micro F-Score: 0.883"
print(svm.tuned.result$byClass[, metrics_of_interest])
##                                     Precision Specificity        F1
## Class: red soil                     0.9603340   0.9870660 0.9704641
## Class: cotton crop                  0.9547739   0.9948127 0.9452736
## Class: grey soil                    0.8934625   0.9713914 0.9077491
## Class: damp grey soil               0.6625767   0.9686788 0.6260870
## Class: soil with vegetation stubble 0.8578199   0.9825378 0.8399072
## Class: very damp grey soil          0.8583510   0.9545455 0.8665955
# Determine some metrics for each class
nnet.macro.fscore = Reduce('+', nnet.tuned.result$byClass[,"F1"]) / length(class.names)
nnet.micro.fscore = Reduce('+', nnet.tuned.result$byClass[,"F1"] * nnet.tuned.result$byClass[,"Prevalence"])
sprintf("NNET macro F-Score: %1.3f", nnet.macro.fscore)
## [1] "NNET macro F-Score: 0.812"
sprintf("NNET micro F-Score: %1.3f", nnet.micro.fscore)
## [1] "NNET micro F-Score: 0.847"
sprintf("Tuned NNET metrics by class:")
## [1] "Tuned NNET metrics by class:"
nnet.tuned.result$byClass[, metrics_of_interest]
##                                     Precision Specificity        F1
## Class: red soil                     0.9522822   0.9843431 0.9652997
## Class: cotton crop                  0.9585492   0.9953890 0.9343434
## Class: grey soil                    0.8965517   0.9726918 0.9032258
## Class: damp grey soil               0.5031056   0.9544419 0.4723032
## Class: soil with vegetation stubble 0.7777778   0.9720605 0.7706422
## Class: very damp grey soil          0.8104167   0.9382632 0.8241525
metrics.comparison = svm.tuned.result$byClass[, metrics_of_interest] - nnet.tuned.result$byClass[, metrics_of_interest]
print("Metrics compared, positive values favor SVM")
## [1] "Metrics compared, positive values favor SVM"
print(metrics.comparison)
##                                        Precision   Specificity          F1
## Class: red soil                      0.008051872  0.0027229408 0.005164450
## Class: cotton crop                  -0.003775353 -0.0005763689 0.010930197
## Class: grey soil                    -0.003089254 -0.0013003901 0.004523271
## Class: damp grey soil                0.159471097  0.0142369021 0.153783750
## Class: soil with vegetation stubble  0.080042127  0.0104772992 0.069264991
## Class: very damp grey soil           0.047934285  0.0162822252 0.042442975

Curve ROC

# Predicts the most probable class class for each instance
svm.tuned.predictions = predict(svm.tuned, testset.x, probability = TRUE)

# Extracts, for each class, the probability of every instance of being of that class 
svm.tuned.predictions.probs = attr(svm.tuned.predictions, "probabilities")

# Extracts, for each class, the probability of every instance of being of that class
nnet.tuned.predictions = predict(nnet.tuned, testset.x, probability = TRUE)

# Creates the dataframe to be used for multiroc
testset.multiroc = data.frame(matrix(ncol=18, nrow=0))

for (i in 1:length(testset.y)) {
    row = matrix(0, 1, 6)
    # sets 1 to the correct class 
    row[testset.y[i]] = 1
    for (j in 1:6)
        # concatenates the probabilities of the svm
        row = c(row, svm.tuned.predictions.probs[i, class.names[j]])
    # concatenates the probabilities of the neural net
    row = c(row, nnet.tuned.predictions[i,])
    testset.multiroc = rbind(testset.multiroc, row)
}

# Formats the dataframe to be suitable for the multi_roc function
colnames(testset.multiroc) = c('red_soil_true', 'cotton_crop_true', 'grey_soil_true', 'damp_grey_soil_true', 'vegetation_true', 'very_damp grey_soil_true', 'red_soil_pred_SVM', 'cotton_crop_pred_SVM', 'grey_soil_pred_SVM', 'damp_grey_soil_pred_SVM', 'vegetation_pred_SVM', 'very_damp grey_soil_pred_SVM', 'red_soil_pred_NN', 'cotton_crop_pred_NN', 'grey_soil_pred_NN', 'damp_grey_soil_pred_NN', 'vegetation_pred_NN', 'very_damp grey_soil_pred_NN')

svm.nnet.multiroc = multi_roc(testset.multiroc, force_diag=T)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
print("Per class AUC values:")
## [1] "Per class AUC values:"
print(unlist(svm.nnet.multiroc$AUC))
##            SVM.red_soil         SVM.cotton_crop           SVM.grey_soil 
##               0.9980333               0.9970103               0.9896131 
##      SVM.damp_grey_soil          SVM.vegetation SVM.very_damp grey_soil 
##               0.9420480               0.9834771               0.9794060 
##               SVM.macro               SVM.micro             NN.red_soil 
##               0.9815677               0.9878187               0.9971247 
##          NN.cotton_crop            NN.grey_soil       NN.damp_grey_soil 
##               0.9979614               0.9891157               0.9211714 
##           NN.vegetation  NN.very_damp grey_soil                NN.macro 
##               0.9747513               0.9726436               0.9754171 
##                NN.micro 
##               0.9846539
# MultiROC plotting
n_method <- length(unique(svm.nnet.multiroc$Methods))
n_group <- length(unique(svm.nnet.multiroc$Groups))

# changes the format of results to a ggplot2 friendly format
for (i in 1:n_group) {
      res_df <- data.frame(Specificity= numeric(0), Sensitivity= numeric(0), Group = character(0), AUC = numeric(0), Method = character(0))
      for (j in 1:n_method) {
        temp_data_1 <- data.frame(Specificity=svm.nnet.multiroc$Specificity[[j]][i],
                                  Sensitivity=svm.nnet.multiroc$Sensitivity[[j]][i],
                                  Group=tools::toTitleCase(gsub("_", " ", unique(svm.nnet.multiroc$Groups)[i])),
                                  AUC=svm.nnet.multiroc$AUC[[j]][i],
                                  Method = unique(svm.nnet.multiroc$Methods)[j])
        colnames(temp_data_1) <- c("Specificity", "Sensitivity", "Group", "AUC", "Method")
        res_df <- rbind(res_df, temp_data_1)

      }
      
      print(ggplot2::ggplot(res_df, ggplot2::aes(x = 1-Specificity, y=Sensitivity)) + ggplot2::geom_path(ggplot2::aes(color = Group, linetype=Method)) + ggplot2::geom_segment(ggplot2::aes(x = 0, y = 0, xend = 1, yend = 1), colour='grey', linetype = 'dotdash') + ggplot2::theme_bw() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5), legend.justification=c(1, 0), legend.position=c(.95, .05), legend.title=ggplot2::element_blank(), legend.background = ggplot2::element_rect(fill=NULL, size=0.5, linetype="solid", colour ="black")))
    
}

Clustering

trainset.clustering = dataset.input.pca
labels = dataset.pca[1]

# dataframe to vector conversion
labels.list <- c()
labels.list = match(as.factor(labels$class), class.names)
set.seed(1)
nk = 2:12
# calculates average silhouette values for an ascending value of k
SW = sapply(nk, function(k){cluster.stats(dist(trainset.clustering), 
                              kmeans(trainset.clustering, centers=k, nstart=5)$cluster)$avg.silwidth})
#SW
plot(nk, SW, type="l", xlab="number of clusters", ylab="average silhouette")

K=6

Pur ottenendo una misura minore di Silhouette in confronto a k=3, e’ stato comunque ritenuto opportuno analizzare i risultati ottenuti tramite l’esecuzioe con parametro k=6, che rappresenta il numero delle classi, e di conseguenza porta ad una maggiore misura di similarita’ tra i cluster rilevati da k-means e l’effettiva suddivisione in classi del dataset

fit = kmeans(trainset.clustering, 6, nstart=5)

# calculates silhouette values for k=6
kms = silhouette(fit$cluster, dist(trainset.clustering))
plot(kms, col=1:6, border=NA, main = "Silhouette Plot for k=6")

# Dissimilarity Matrix
dissplot(dist(trainset.clustering), labels=fit$cluster, options=list(main="Kmeans clustering with k=6"))

# Evaluation
sprintf("Similarity with (k=6): %f", cluster.evaluation(labels.list, fit$cluster))
## [1] "Similarity with (k=6): 0.689679"
# visualization
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,2), main="Dimensions: 1 & 2", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,3), main="Dimensions: 1 & 3", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,4), main="Dimensions: 1 & 4", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,3), main="Dimensions: 2 & 3", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,4), main="Dimensions: 2 & 4", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(3,4), main="Dimensions: 3 & 4", 
             geom="point")

# Confusion Matrix
source("clustering_confusion_matrix.R")

confusion_matrix = clustering_confusion_matrix(fit$cluster, labels.list, 6, 6)

for (i in 1:6) {
    pie_title = sprintf("Cluster %d Distribution", i)
    pie(confusion_matrix[i,], main=pie_title, labels=class.names, col=rainbow(6))
}

#
for (i in 1:6) {
    pie_title = sprintf("Class %s Distribution", class.names[i])
    pie(confusion_matrix[,i], main=pie_title, labels=1:6, col=rainbow(6))
}

K=3

Nonostante il valore di Silhouette maggiore rispetto a k=6, il parametro k=3 porta ad un peggioramento nella misura di similarita’ tra cluster rilevati e classi del dataset

fit = kmeans(trainset.clustering, 3, nstart=5)

# calculates silhouettes values for k=3
kms = silhouette(fit$cluster, dist(trainset.clustering))
plot(kms, col=1:3, border=NA, main = "Silhouette Plot for k=3")

# Dissimilarity Matrix
dissplot(dist(trainset.clustering), labels=fit$cluster, options=list(main="Kmeans clustering with k=3"))

# Evaluation
sprintf("Similarity with (k=3): %f", cluster.evaluation(labels.list, fit$cluster))
## [1] "Similarity with (k=3): 0.529782"
# visualization
fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,2), main="Dimensions: 1 & 2", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,3), main="Dimensions: 1 & 3", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(1,4), main="Dimensions: 1 & 4", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,3), main="Dimensions: 2 & 3", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(2,4), main="Dimensions: 2 & 4", 
             geom="point")

fviz_cluster(fit, trainset.clustering, ellipse.type = "norm", axes=c(3,4), main="Dimensions: 3 & 4", 
             geom="point")

# Confusion Matrix
source("clustering_confusion_matrix.R")

confusion_matrix = clustering_confusion_matrix(fit$cluster, labels.list, 3, 6)

for (i in 1:3) {
    pie_title = sprintf("Cluster %d Disribution", i)
    pie(confusion_matrix[i,], main=pie_title, labels=class.names, col=rainbow(6))
}

#
for (i in 1:6) {
    pie_title = sprintf("Class %s Distribution", class.names[i])
    pie(confusion_matrix[,i], main=pie_title, labels=1:6, col=rainbow(3))
}

Evaluation al variare di k

Evidenzia la misura di similarita’ tra i cluster e le classi del dataset, al variaaare del parametro k Il valore piu’ alto e’ ottenuto con k=6, in linea con le rilevazioni precedenti

similarity = c()

# Calculates the similarity between clusters and targets for an ascending value of k
for (k in 2:6) {
    fit = kmeans(trainset.clustering, k, nstart = 5)
    eval = cluster.evaluation(labels.list, fit$cluster)
    similarity = append(similarity, eval)
}
plot(2:6, similarity, type="b", main = "Similarity plot for ascending K", xlab="K")